librarian::shelf(
  caret, dismo, dplyr, DT, GADMTools, GGally, ggplot2, here, htmltools, leaflet, mapview, maptools, purrr, ranger, raster, readr, rgbif, rgdal, rJava, rpart, rpart.plot, rsample, pdp, sdmpredictors, skimr, sf, spocc, tidyr, usdm, vip)

select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = FALSE)
# set random seed for reproducibility
set.seed(42)

ggplot2::theme_set(ggplot2::theme_light())

# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F, recursive = T)
pts_env_csv <- file.path(dir_data, "pts_env.csv")
pts_geo       <- file.path(dir_data, "pts.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
mdl_maxv_rds  <- file.path(dir_data, "mdl_maxent_vif.rds")
redo <- FALSE

Overview

This machine learning analysis was completed as an assignment for my Master’s program course, Environmental Data Science 232: Machine Learning. It was assigned by our professor, Dr. Ben Best, as an introduction to machine learning by predicting presence of a chosen species from observations and environmental data found on the Global Biodiversity Information Facility site. It follows guidance found at Species distribution modeling | R Spatial.

My chosen species is coyote brush (Baccharis pilularis). Baccharis pilularis is native to the west coast of the United States (Oregon, California, and Baja California, Mexico). It is a shurb in the Asteraceae (Sunflower) family with oblanceolate to obovate toothed leaves, panicle-like inflorescence with staminate flowers that when mature mimic snow, and generally sticky (not a pun) [@jepson:bp].

Baccharis pilularis Image Credit: CalScape

Explore

The first step in this machine learning exercise is to download observation data of Baccharis pilularis from the Global Biodiversity Information Facility site.

Aquire species observations

obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
if (!file.exists(obs_geo) | redo){
  # get species occurrence data from GBIF with coordinates
  (res <- spocc::occ(
    query = 'Baccharis pilularis', 
    from = 'gbif', 
    has_coords = T,
    limit = 10000))
  
  # extract data frame from result
  df <- res$gbif$data[[1]] 
  readr::write_csv(df, obs_csv)
  
  # convert to points of observation from lon/lat columns in data frame
  obs <- df %>% 
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>% 
    select(prov, key) # save space (joinable from obs_csv)
  sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 10000
# show points on map
mapview::mapview(obs, map.types = "CartoDB.Voyager")
obs$key <- as.factor(obs$key)

# count number of observations
obs_num <- nrow(obs)

# Check for duplicates - creates a vector of T or F for each of the points ???should you use 'key' vs 'geom'???
dups <- duplicated(obs$key)

# how many duplicates were there? This will sum only the TRUE values
sum(dups)
## [1] 0
# create lon and lat columns in preparation to clean inaccurate data points

obs <- obs %>%
  dplyr::mutate(lon = sf::st_coordinates(.)[,1],
                lat = sf::st_coordinates(.)[,2])

usa <- gadm_sf_loadCountries("USA", level = 2, basefile = "data/")
  • Question 1. How many observations total are in GBIF for your species?

There are 10000` observations for Baccharis pilularis in this data. According to the iNaturalist site, over 19,000 observations have been uploaded of this species.

  • Question 2. Do you see any odd observations, like marine species on land or vice versa?

There were only a few observably inaccurate data points for this species. These points seemingly fall in the ocean along the coastline. I chose not to remove these points as there may be inaccuracies on the basemap chosen and these points may actually be on land. I also did not want to lose potential valuable environmental data.

Aquire environmental data

The next step is to use the Species Distribution Model predictors R package sdmpredictors to get underlying environmental data for Baccharis pilularis observations.

Environmental data
dir_env <- file.path(dir_data, "env")

# set a default data directory
options(sdmpredictors_datadir = dir_env)

# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# show table of datasets
env_datasets %>% 
  select(dataset_code, description, citation) %>% 
  DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio6", "WC_bio12", "ER_PETseasonality", "ER_topoWet", "ER_climaticMoistureIndex")

# get layers
env_stack <- load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)

  • Question 3. What environmental layers did you choose as predictors? Can you find any support for these in the literature?

  • WC_alt = altitude. This was chosen due to the limited elevation range listed on the CalFlora site, which states that coyote brush has been documented between -3 to 2310 meters (CalFlora 2021).

  • WC_bio1 = annual mean temperature. This is a fundamental environmental indicator so it has been included.

  • WC_bio6 = minimum temperature of the coldest month Baccharis pilularis is a distributed throughout the California Floristic Province, which is a mediterranean climate (Calsbeek 2003). This climatic region does not have prolonged freezing temperatures, so I am interested in analyzing the influence of minimum cold temperature on the habitat range of coyote brush.

  • WC_bio12 = annual precipitation

  • ER_PETseasonality = monthly variability in potential evapotranspiration Coyote brush uptakes fog water, so I am curious to analyze if evapotranspiration rates influences its distribution (Emery 2018). I anticipate that areas with higher rates of evapotranspiration are not as suitable for B. pilularis.

  • ER_topoWet = topographic wetness index this is a proxy for soil wetness, and can characterize biological processes such as annual net primary production, vegetation patterns, and forest site quality. Coyote brush is found from wetlands to chaparral. I am interested to see if soil moisture influences the location of coyote brush.

  • ER_climaticMoistureIndex = climatic moisture index. This is a metric of relative wetness and aridity. It is calculated as the difference between annual precipitation and potential evapotranspiration (Vorosmarty 2005).

Region of Interest

The environmental data is on a global scale. Here we crop the environmental rasters to a region of interest around the distribution of Baccharis pilularis.

obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

if (!file.exists(obs_hull_geo) | redo){
  # make convex hull around points of observation
  obs_hull <- sf::st_convex_hull(st_union(obs))
  
  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)

# show points on map
mapview(
  list(obs, obs_hull))
if (!file.exists(env_stack_grd) | redo){
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite = T)  
}
env_stack <- stack(env_stack_grd)

# show map
# mapview(obs) + 
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)

Pseudo-Absence

absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

if (!file.exists(absence_geo) | redo){
  # get raster count of observations
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
  
  # create mask for 
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
  
  # generate random points inside mask
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
    as_tibble() %>% 
    st_as_sf(coords = c("x", "y"), crs = 4326)
  
  write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)

# show map of presence, ie obs, and absence
mapview(absence, col.regions = "gray") +
  mapview(obs, col.regions = "green")
if (!file.exists(pts_env_csv) | redo){

  # combine presence and absence into single set of labeled points 
  pts <- rbind(
    obs %>% 
      mutate(
        present = 1) %>% 
      select(present, key),
    absence %>% 
      mutate(
        present = 0,
        key     = NA)) %>% 
    mutate(
      ID = 1:n()) %>% 
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn=T)

  # extract raster values for points
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>% 
    tibble() %>% 
    # join present and geometry columns to raster value results for points
    left_join(
      pts %>% 
        select(ID, present),
      by = "ID") %>% 
    relocate(present, .after = ID) %>% 
    # extract lon, lat as single columns
    mutate(
      #present = factor(present),
      lon = st_coordinates(geometry)[,1],
      lat = st_coordinates(geometry)[,2]) %>% 
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)

pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>% 
  DT::datatable(
    rownames = F,
    options = list(
      dom = "t",
      pageLength = 20))

In the end this table is the data that feeds into our species distribution model (y ~ X), where:

  • y is the present column with values of 1 (present) or 0 (absent)
  • X is all other columns: WC_alt, WC_bio1, WC_bio6, WC_bio12, ER_PETseasonality, ER_topoWet, ER_climaticMoistureIndex, lon, lat

Term Plots

In the vein of exploratory data analyses, before going into modeling let’s look at the data. Specifically, let’s look at how obviously differentiated is the presence versus absence for each predictor – a more pronounced presence peak should make for a more confident model. A plot for a specific predictor and response is called a “term plot”. In this case we’ll look for predictors where the presence (present = 1) occupies a distinct “niche” from the background absence points (present = 0).

pts_env %>% 
  select(-ID) %>% 
  mutate(
    present = factor(present)) %>% 
  pivot_longer(-present) %>% 
  ggplot() +
  geom_density(aes(x = value, fill = present)) + 
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  theme_bw() + 
  facet_wrap(~name, scales = "free") +
  labs(title = "Baccharis pilularis Term Plots") +
  theme(
    legend.position = c(1, 0),
    legend.justification = c(1, 0))

Section B: Explore (cont’d)

Pairs Plot

Show correlations between variables.

pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 20000
datatable(pts_env, rownames = F)
GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present), alpha = 0.5))
Pairs plot with `present` color coded.

Pairs plot with present color coded.

Logistic Regression

Setup Data

Let’s setup a data frame with only the data we want to model by:

  • Dropping rows with any NAs. Later we’ll learn how to “impute” values with guesses so as to not throw away data.
  • Removing terms we don’t want to model. We can then use a simplified formula \(present \sim .\) to predict \(present\) based on all other fields in the data frame (i.e. the \(X\)`s in \(y \sim x_1 + x_2 + ... x_n\)).
# setup model data
d <- pts_env %>% 
  select(-ID) %>%  # remove terms we don't want to model
  tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 19955

Linear Model

Let’s start as simply as possible with a linear model lm() on multiple predictors X to predict presence y using a simpler workflow.

Simpler workflow with only fit and predict of all data, i.e. no splitting.

# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34125 -0.09623  0.02608  0.14098  0.84156 
## 
## Coefficients:
##                              Estimate   Std. Error t value             Pr(>|t|)
## (Intercept)               1.014473188  0.212760200   4.768  0.00000187235038973
## WC_alt                   -0.000407896  0.000012364 -32.990 < 0.0000000000000002
## WC_bio1                  -0.108523028  0.003334510 -32.545 < 0.0000000000000002
## WC_bio6                   0.050921864  0.002776725  18.339 < 0.0000000000000002
## WC_bio12                  0.000168568  0.000021199   7.952  0.00000000000000193
## ER_PETseasonality        -0.000023579  0.000005206  -4.529  0.00000595603074086
## ER_topoWet                0.004080451  0.001809388   2.255               0.0241
## ER_climaticMoistureIndex -0.437734442  0.026798699 -16.334 < 0.0000000000000002
## lon                      -0.027553639  0.001508587 -18.265 < 0.0000000000000002
## lat                      -0.063156676  0.001668332 -37.856 < 0.0000000000000002
##                             
## (Intercept)              ***
## WC_alt                   ***
## WC_bio1                  ***
## WC_bio6                  ***
## WC_bio12                 ***
## ER_PETseasonality        ***
## ER_topoWet               *  
## ER_climaticMoistureIndex ***
## lon                      ***
## lat                      ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2474 on 19945 degrees of freedom
## Multiple R-squared:  0.7552, Adjusted R-squared:  0.7551 
## F-statistic:  6838 on 9 and 19945 DF,  p-value: < 0.00000000000000022
y_predict <- predict(mdl, d, type="response")
y_true    <- d$present
range(y_predict)
## [1] -0.3075928  1.3412458
range(y_true)
## [1] 0 1

The problem with these predictions is that it ranges outside the possible values of present 1 and absent 0. (Later we’ll deal with converting values within this range to either 1 or 0 by applying a cutoff value; i.e. any values > 0.5 become 1 and below become 0.)

Generalized Linear Model

To solve this problem of constraining the response term to being between the two possible values, i.e. the probability \(p\) of being one or the other possible \(y\) values, we’ll apply the logistic transformation on the response term.

\[ logit(p_i) = \log_{e}\left( \frac{p_i}{1-p_i} \right) \]

We can expand the expansion of the predicted term, i.e. the probability \(p\) of being either \(y\), with all possible predictors \(X\) whereby each coeefficient \(b\) gets multiplied by the value of \(x\):

\[ \log_{e}\left( \frac{p_i}{1-p_i} \right) = b_0 + b_1 x_{1,i} + b_2 x_{2,i} + \cdots + b_k x_{k,i} \]

# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.8268  -0.0026   0.0000   0.1576   3.6605  
## 
## Coefficients:
##                              Estimate   Std. Error z value             Pr(>|z|)
## (Intercept)              -191.4745580    8.8475632 -21.642 < 0.0000000000000002
## WC_alt                      0.0027073    0.0003900   6.942     0.00000000000387
## WC_bio1                    -0.1463873    0.0940623  -1.556              0.11964
## WC_bio6                     1.7503783    0.0888087  19.710 < 0.0000000000000002
## WC_bio12                    0.0034345    0.0004891   7.022     0.00000000000219
## ER_PETseasonality          -0.0001212    0.0001436  -0.844              0.39865
## ER_topoWet                 -0.3508127    0.0504855  -6.949     0.00000000000368
## ER_climaticMoistureIndex   -8.4321597    0.6892473 -12.234 < 0.0000000000000002
## lon                        -1.5967077    0.0671791 -23.768 < 0.0000000000000002
## lat                        -0.1542601    0.0516567  -2.986              0.00282
##                             
## (Intercept)              ***
## WC_alt                   ***
## WC_bio1                     
## WC_bio6                  ***
## WC_bio12                 ***
## ER_PETseasonality           
## ER_topoWet               ***
## ER_climaticMoistureIndex ***
## lon                      ***
## lat                      ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27663.4  on 19954  degrees of freedom
## Residual deviance:  4201.4  on 19945  degrees of freedom
## AIC: 4221.4
## 
## Number of Fisher Scoring iterations: 9
y_predict <- predict(mdl, d, type="response")

range(y_predict)
## [1] 0.0000000000000002220446 0.9999927741439832429293

Excellent, our response is now constrained between 0 and 1. Next, let’s look at the term plots to see the relationship between predictor and response.

# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F, ylim="free")

Generalized Additive Model

With a generalized additive model we can add “wiggle” to the relationship between predictor and response by introducing smooth s() terms.

librarian::shelf(mgcv)

# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio6) + s(WC_bio12) + 
    s(ER_PETseasonality) + s(ER_topoWet) + s(ER_climaticMoistureIndex) + s(lon) + s(lat),
  family = binomial, data = d)
summary(mdl)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio6) + s(WC_bio12) + 
##     s(ER_PETseasonality) + s(ER_topoWet) + s(ER_climaticMoistureIndex) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -23.37      13.79  -1.695   0.0901 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                               edf Ref.df Chi.sq              p-value    
## s(WC_alt)                   3.177  3.732  37.05          0.000000904 ***
## s(WC_bio1)                  8.030  8.165  93.02 < 0.0000000000000002 ***
## s(WC_bio6)                  5.937  6.076  92.63 < 0.0000000000000002 ***
## s(WC_bio12)                 8.165  8.751  36.35          0.000027928 ***
## s(ER_PETseasonality)        5.510  6.701  48.52 < 0.0000000000000002 ***
## s(ER_topoWet)               7.705  8.501  25.14              0.00218 ** 
## s(ER_climaticMoistureIndex) 8.345  8.816  26.98              0.00284 ** 
## s(lon)                      6.183  6.804 110.73 < 0.0000000000000002 ***
## s(lat)                      8.920  8.995 117.03 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.921   Deviance explained = 90.1%
## UBRE = -0.85613  Scale est. = 1         n = 19955
# show term plots
plot(mdl, scale=0)

Question: Which GAM environmental variables, and even range of values, seem to contribute most towards presence (above 0 response) versus absence (below 0 response)?

The variables most likely to contribute to presence from the 7 I chose originally are ER_topoWet and ER-climaticMoistureIndex. For ER-climaticMoistureIndex, the higher the index number, specifically above 0, the higher likelihood of coyote brush presence. ER_topoWet is less conclusive as the higher likelihood of presence is in the value range less than 8, but there is a wide confidence interval.

Maxent (Maximum Entropy)

Maxent is probably the most commonly used species distribution model (Elith 2011) since it performs well with few input data points, only requires presence points (and samples background for comparison) and is easy to use with a Java graphical user interface (GUI).

# load extra packages
librarian::shelf(
  maptools, sf)

mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")

# show version of maxent
if (!interactive())
  maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)

# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>% 
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds) | redo){
  mdl <- maxent(env_stack, obs_sp)
  readr::write_rds(mdl, mdl_maxent_rds)
}
mdl <- read_rds(mdl_maxent_rds)

# plot variable contributions per predictor
plot(mdl)

# plot term plots
response(mdl)

Question: Which Maxent environmental variables, and even range of values, seem to contribute most towards presence (closer to 1 response) and how might this differ from the GAM results?

Altitude and ER_climaticVariableMoisture variables contribute the most towards presence of coyote brush. The ER_climaticVariableMoisture was one of the main predicators in the GAM model. As for why altitude was not as obviously a primary presence predictor is potentially due to the scale of the GAM plot. It was difficult to intepret the presence weight on the WC_alt GAM plot as the elevation range on the x-axis was more extensive than other variable x-axes.

# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')

plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

Section C. Decision Trees

d <- pts_env %>% 
  select(-ID) %>%                   # not used as a predictor x
  mutate(
    present = factor(present)) %>%  # categorical response
  na.omit()                         # drop rows with NA
skim(d)
Data summary
Name d
Number of rows 19955
Number of columns 10
_______________________
Column type frequency:
factor 1
numeric 9
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
present 0 1 FALSE 2 0: 9998, 1: 9957

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
WC_alt 0 1 658.73 703.94 -76.00 94.00 291.00 1270.00 3636.00 ▇▂▂▁▁
WC_bio1 0 1 13.47 3.94 -0.70 11.10 13.80 16.00 23.80 ▁▃▇▇▂
WC_bio6 0 1 0.75 5.48 -14.20 -3.40 2.90 4.80 9.90 ▁▃▂▇▅
WC_bio12 0 1 555.92 404.27 47.00 293.00 445.00 686.00 2985.00 ▇▂▁▁▁
ER_PETseasonality 0 1 4970.80 1368.69 1819.55 3706.00 5241.52 6188.03 7648.14 ▂▆▅▇▃
ER_topoWet 0 1 10.38 1.35 6.83 9.45 10.32 11.16 15.04 ▁▇▇▂▁
ER_climaticMoistureIndex 0 1 -0.52 0.36 -0.97 -0.76 -0.63 -0.32 0.78 ▇▅▂▁▁
lon 0 1 -119.01 3.90 -124.56 -122.23 -120.12 -116.96 -106.46 ▇▅▃▂▁
lat 0 1 37.17 3.36 30.21 34.36 37.39 38.57 47.29 ▃▆▇▂▁

Split data into training and testing

# create training set with 80% of full data
d_split  <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train  <- rsample::training(d_split)

# show number of rows present is 0 vs 1
table(d$present)
## 
##    0    1 
## 9998 9957
table(d_train$present)
## 
##    0    1 
## 7998 7965

Decision Trees

Partition, depth=1

# run decision stump model
mdl <- rpart(
  present ~ ., data = d_train, 
  control = list(
    cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 15963 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 15963 7965 0 (0.50103364 0.49896636)  
##   2) WC_bio6< 2.35 7150  450 0 (0.93706294 0.06293706) *
##   3) WC_bio6>=2.35 8813 1298 1 (0.14728242 0.85271758) *
# plot tree 
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)

Partition, depth=default

# decision tree with defaults
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 15963 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 15963 7965 0 (0.501033640 0.498966360)  
##   2) WC_bio6< 2.35 7150  450 0 (0.937062937 0.062937063) *
##   3) WC_bio6>=2.35 8813 1298 1 (0.147282424 0.852717576)  
##     6) lon>=-116.8832 1046    6 0 (0.994263862 0.005736138) *
##     7) lon< -116.8832 7767  258 1 (0.033217458 0.966782542) *
rpart.plot(mdl)

# plot complexity parameter
plotcp(mdl)

# rpart cross validation results
mdl$cptable
##          CP nsplit  rel error     xerror        xstd
## 1 0.7805399      0 1.00000000 1.02146893 0.007929761
## 2 0.1298180      1 0.21946014 0.21946014 0.004953374
## 3 0.0100000      2 0.08964218 0.09001883 0.003285447

Question: Based on the complexity plot threshold, what size of tree is recommended? Recommended tree size is 3.

Feature interpretation

# caret cross validation results
mdl_caret <- train(
  present ~ .,
  data       = d_train,
  method     = "rpart",
  trControl  = trainControl(method = "cv", number = 10),
  tuneLength = 20)

ggplot(mdl_caret)
Cross-validated accuracy rate for the 20 different $\alpha$ parameter values in our grid search. Lower $\alpha$ values (deeper trees) help to minimize errors.

Cross-validated accuracy rate for the 20 different \(\alpha\) parameter values in our grid search. Lower \(\alpha\) values (deeper trees) help to minimize errors.

vip(mdl_caret, num_features = 40, bar = FALSE)
Variable importance based on the total reduction in MSE for the Ames Housing decision tree.

Variable importance based on the total reduction in MSE for the Ames Housing decision tree.

Question: what are the top 3 most important variables of your model? ER_PETseasonality, WC_bio6, WC_alt

# Construct partial dependence plots
p1 <- partial(mdl_caret, pred.var = "ER_PETseasonality") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio6") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("ER_PETseasonality", "WC_bio6")) %>% 
  plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE, 
              colorkey = TRUE, screen = list(z = -20, x = -60))

# Display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
Partial dependence plots to understand the relationship between ER_PETseasonality, WC_bio6 and present.

Partial dependence plots to understand the relationship between ER_PETseasonality, WC_bio6 and present.

Random Forests

Fit

# number of features
n_features <- length(setdiff(names(d_train), "present"))

# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)

# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.1302949

Feature interpretation

# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
  present ~ ., data = d_train,
  importance = "impurity")

# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
  present ~ ., data = d_train,
  importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)

gridExtra::grid.arrange(p1, p2, nrow = 1)
Most important variables based on impurity (left) and permutation (right).

Most important variables based on impurity (left) and permutation (right).

Question: How might variable importance differ between rpart and RandomForest in your model outputs? RandomForest it creates a “forest” of decision trees where r part creates a single decision tree. RandomForest reduces model variance, which has changed the importance of environmental variables predicting presence of coyote brush. The most important varible with RandomForest is WC_bio6.

Section D. Evaluate Models

# read points of observation: presence (1) and absence (0)
pts <- read_sf(pts_geo)

# read raster stack of environment
env_stack <- raster::stack(env_stack_grd)

Split observations into training and testing

# create training set with 80% of full data
pts_split  <- rsample::initial_split(
  pts, prop = 0.8, strata = "present")
pts_train  <- rsample::training(pts_split)
pts_test   <- rsample::testing(pts_split)

pts_train_p <- pts_train %>% 
  filter(present == 1) %>% 
  as_Spatial()
pts_train_a <- pts_train %>% 
  filter(present == 0) %>% 
  as_Spatial()

Calibrate: Model Selection

# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)

# calculate variance inflation factor per predictor, a metric of multicollinearity between variables
vif(env_stack)
##                  Variables       VIF
## 1                   WC_alt  9.442294
## 2                  WC_bio1 25.760982
## 3                  WC_bio6 38.308369
## 4                 WC_bio12 27.701658
## 5        ER_PETseasonality  3.981612
## 6               ER_topoWet  1.938013
## 7 ER_climaticMoistureIndex 34.672414
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7) 
v
## 3 variables from the 7 input variables have collinearity problem: 
##  
## ER_climaticMoistureIndex WC_bio6 WC_bio1 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( ER_PETseasonality ~ WC_alt ):  -0.0618899 
## max correlation ( ER_PETseasonality ~ WC_bio12 ):  -0.5180566 
## 
## ---------- VIFs of the remained variables -------- 
##           Variables      VIF
## 1            WC_alt 1.341598
## 2          WC_bio12 1.706693
## 3 ER_PETseasonality 1.465038
## 4        ER_topoWet 1.724998
# reduce environmental raster stack by 
env_stack_v <- usdm::exclude(env_stack, v)

# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)

# fit a maximum entropy model
if (!file.exists(mdl_maxv_rds)){
  mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
  readr::write_rds(mdl_maxv, mdl_maxv_rds)
}

mdl_maxv <- read_rds(mdl_maxv_rds)
# plot variable contributions per predictor
plot(mdl_maxv)

# plot term plots
response(mdl_maxv)

Question: Which variables were removed due to multicollinearity and what is the rank of most to least important remaining variables in your model?

WC_bio1, WC_bio6, ER_climaticMoistureIndex were removed. For the environmental variable ranking of most to least important variable for species presence is:
- ER_PETseasonality - WC_alt - WC_bio12 - ER_topoWet

# predict
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')

plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

Evaluate: Model Performance

Area Under the Curve (AUC), Reciever Operater Characteristic (ROC) Curve and Confusion Matrix

pts_test_p <- pts_test %>% 
  filter(present == 1) %>% 
  as_Spatial()
pts_test_a <- pts_test %>% 
  filter(present == 0) %>% 
  as_Spatial()

y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)

e <- dismo::evaluate(
  p     = pts_test_p,
  a     = pts_test_a, 
  model = mdl_maxv,
  x     = env_stack)
e
## class          : ModelEvaluation 
## n presences    : 1992 
## n absences     : 2000 
## AUC            : 0.9806604 
## cor            : 0.8520653 
## max TPR+TNR at : 0.6677061
plot(e, 'ROC')

thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.6677061
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)

# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)

matrix(
  c(tpr, fnr,
    fpr, tnr), 
  nrow=2, dimnames = list(
    c("present_obs", "absent_obs"),
    c("present_pred", "absent_pred")))
##             present_pred absent_pred
## present_obs   0.93624498      0.0685
## absent_obs    0.06375502      0.9315
# add point to ROC plot
points(fpr, tpr, pch=23, bg="blue")

plot(y_maxv > thr)

References

Bogler, David. 2012. Baccharis pilularis. Jepson Flora Project (eds.) Jepson eFlora. https://ucjeps.berkeley.edu/eflora/eflora_display.php?tid=1603. Last accessed on January 08, 2022

CalFlora. 2021. Plant Location Suitability - Baccharis pilularis. https://www.calflora.org/entry/compare.html?crn=1031. Last accessed 25 January 2022

Vorosmarty, C.J., Douglas, E.M., Green, P.A., Revenda, C. 2005. Geospatial indicators of emerging water stress: An application to Africa. Ambio. 34:3. 230 - 236.

Emery, Nathan C., D’Antonio, Carla M., Still, Christopher J. 2018. Fog and live fuel moisture in coastal California shrublands. Ecosphere. 9:14. https://doi.org/10.1002/ecs2.2167

Calsbeek, Ryan, Thompson, John, and Richardson, James. 2003. Patterns of molecular evolution and diversification in a biodiversity hotspot: the California Floristic Province. Molecular Ecology. 12. 1021-1029.